import numpy as np
import pandas as pd
#import datetime
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
df = pd.read_csv("https://raw.githubusercontent.com/SnehaBaranwal/snehab2412.github.io/master/casestudy.csv")
df.head()
| Unnamed: 0 | customer_email | net_revenue | year | |
|---|---|---|---|---|
| 0 | 0 | nhknapwsbx@gmail.com | 249.92 | 2015 |
| 1 | 1 | joiuzbvcpn@gmail.com | 87.61 | 2015 |
| 2 | 2 | ukkjctepxt@gmail.com | 168.38 | 2015 |
| 3 | 3 | gykatilzrt@gmail.com | 62.40 | 2015 |
| 4 | 4 | mmsgsrtxah@gmail.com | 43.08 | 2015 |
df.isnull().sum() #Checking the count of non-available (NaN) values
Unnamed: 0 0 customer_email 0 net_revenue 0 year 0 dtype: int64
We can see there are no not available values in the table which is good.
#Total revenue for the current year
df1 = df.groupby('year')['net_revenue'].sum()
print(df1)
year 2015 29036749.19 2016 25730943.59 2017 31417495.03 Name: net_revenue, dtype: float64
We can see the total revenue of the years - 2015, 2016 and 2017
#New Customer Revenue e.g. new customers not present in previous year only
##We can see that the customer email is the only unique identifier for the customer and that's why we are going to use that to
##check if the customer is new.
dfFor2015 = df.loc[df.year == 2015]
dfFor2016 = df.loc[df.year == 2016]
dfFor2017 = df.loc[df.year == 2017]
emailList2015 = dfFor2015.customer_email.tolist()
emailList2016 = dfFor2016.customer_email.tolist()
emailList2017 = dfFor2017.customer_email.tolist()
Custdifference_2016 = list(set(emailList2016) - set(emailList2015))
Custdifference_2017 = list(set(emailList2017) - set(emailList2016))
dfcustomers2016 = df.loc[df.customer_email.isin(Custdifference_2016)]
dfcustomers2016Exact = dfcustomers2016.loc[dfcustomers2016.year==2016]
print("The net revenue for 2016 customers only is",dfcustomers2016Exact['net_revenue'].sum())
dfcustomers2017 = df.loc[df.customer_email.isin(Custdifference_2017)]
dfcustomers2017Exact = dfcustomers2017.loc[dfcustomers2017.year==2017]
print("The net revenue for 2017 customers only is",dfcustomers2017Exact['net_revenue'].sum())
The net revenue for 2016 customers only is 18245491.01 The net revenue for 2017 customers only is 28776235.039999995
df_total_revenue_2015 = df.loc[df['year'] == 2015, 'net_revenue'].sum()
print(df_total_revenue_2015)
df_total_revenue_2016 = df.loc[df['year'] == 2016, 'net_revenue'].sum()
print(df_total_revenue_2016)
df_total_revenue_2017 = df.loc[df['year'] == 2017, 'net_revenue'].sum()
29036749.189999994 25730943.59
#Existing Customer Growth
existing_customer_growth_2016 = df_total_revenue_2016 - df_total_revenue_2015
existing_customer_growth_2017 = df_total_revenue_2017 - df_total_revenue_2016
print("The existing customer growth for 2016 is",existing_customer_growth_2016)
print("The existing customer growth for 2017 is",existing_customer_growth_2017)
The existing customer growth for 2016 is -3305805.599999994 The existing customer growth for 2017 is 5686551.440000016
#Revenue lost from attrition (Loss of customer revenue from customers who were in the previous year but not in the next year)
df_year_2015 = df.loc[df.year == 2015]
#print(cust_2015)
df_year_2016 = df.loc[df.year == 2016]
df_year_2017 = df.loc[df.year == 2017]
#Converting the customer email of the years into a list as it is easy to find out the data difference between the years.
l_2015 = df_year_2015.customer_email.tolist()
l_2016 = df_year_2016.customer_email.tolist()
l_2017 = df_year_2017.customer_email.tolist()
attritionLoss2016 = df_year_2015.loc[df_year_2015.customer_email.isin(l_2016)==False].net_revenue.sum()
print("The attrition loss in 2016 is",attritionLoss2016)
attritionLoss2017 = df_year_2016.loc[df_year_2016.customer_email.isin(l_2017)==False].net_revenue.sum()
print("The attrition loss in 2017 is",attritionLoss2017)
The attrition loss in 2016 is 21571632.070000004 The attrition loss in 2017 is 23110294.939999998
#Existing Customer Revenue Current Year that is 2017
existing_cus_revenue_2015 = df.loc[df['year'] == 2015]['net_revenue'].sum()
print("The existing customer revenue for 2015 is",existing_cus_revenue_2015)
#Existing Customer Revenue Prior Year that is 2016
existing_cus_revenue_2016 = df.loc[df['year'] == 2016]['net_revenue'].sum()
print("The existing customer revenue for 2016 is",existing_cus_revenue_2016)
#Existing Customer Revenue Prior Year that is 2015
existing_cus_revenue_2017 = df.loc[df['year'] == 2017]['net_revenue'].sum()
print("The existing customer revenue for 2017 is",existing_cus_revenue_2017)
The existing customer revenue for 2015 is 29036749.189999994 The existing customer revenue for 2016 is 25730943.59 The existing customer revenue for 2017 is 31417495.030000016
#Existing Customer Revenue Prior Year
#Total Customers Current Year i.e., 2017
NumberOfCus2017 = df.loc[df['year'] == 2017]['customer_email'].count()
print("The total number of customers in the current year 2017 is",NumberOfCus2017)
#Total Customers Previous Year i.e., 2016
NumberOfCus2016 = df.loc[df['year'] == 2016]['customer_email'].count()
print("The total number of customers in the previous year 2016 is",NumberOfCus2016)
#Total Customers Previous Year i.e., 2015
NumberOfCus2015 = df.loc[df['year'] == 2015]['customer_email'].count()
print("The total number of customers in previous year 2015 is",NumberOfCus2015)
The total number of customers in the current year 2017 is 249987 The total number of customers in the previous year 2016 is 204646 The total number of customers in previous year 2015 is 231294
#New Customers 2017
newCustomers2017 = NumberOfCus2017 - NumberOfCus2016
newCustomers2017
45341
#Customers Lost
customerLost = NumberOfCus2016 - NumberOfCus2015
customerLost
-26648
#fig = sns.boxplot(x="year", y="net_revenue", data=df)
revenue= df.groupby('year')['net_revenue'].sum()
revenue
year 2015 29036749.19 2016 25730943.59 2017 31417495.03 Name: net_revenue, dtype: float64
year= df['year'].unique()
year
array([2015, 2016, 2017], dtype=int64)
fig = px.box(df, x='year', y='net_revenue')
fig.show()
Here we can see that for all the years 2015, 2016, 2017, the quartiles and the median are almost same for every year and has not changed overall in the net_revenue.
s = df['customer_email'].value_counts() ## Counts the occurrence of unqiue elements and stores in a variable called "s" which is series type
#new = pd.DataFrame({'loan_status':s.index, 'Count':s.values})
s=df.groupby('year')['customer_email'].count()
s
year 2015 231294 2016 204646 2017 249987 Name: customer_email, dtype: int64
yr=df['year'].unique()
yr
array([2015, 2016, 2017], dtype=int64)
import pandas as pd
import plotly.express as px
import statsmodels.formula.api as smapi
import numpy as np
fig = px.bar(df, yr, s, color=df['year'].unique())
fig.update_layout(xaxis={'dtick': 1})
fig.show()
fig = px.line(df, x=yr, y=revenue, markers='o')
fig.update_layout(xaxis={'dtick': 1})
fig.show()
Here, through this graph it becomes interesting. We can see a major dip in the yearly net_revenue even though the median and the quartiles for all the years 2015, 2016 and 2017 are nearly same. It is good though, as the revenue increased in the last year of 2017 after the dip in 2016.